{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "3eeee95f-e4f2-4052-94fb-a5dc8ab542ae",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/tomazbratanic/anaconda3/lib/python3.11/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).\n",
      "  from pandas.core import (\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from neo4j import GraphDatabase\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "b6c15443-4acb-4f91-88ea-4e08abaa4c29",
   "metadata": {},
   "outputs": [],
   "source": [
    "NEO4J_URI=\"bolt://localhost\"\n",
    "NEO4J_USERNAME=\"neo4j\"\n",
    "NEO4J_PASSWORD=\"password\"\n",
    "\n",
    "driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "d787bf7b-ac9b-4bfb-b140-a50a3fd205c5",
   "metadata": {},
   "outputs": [],
   "source": [
    "def batched_import(statement, df, batch_size=1000):\n",
    "    total = len(df)\n",
    "    start_s = time.time()\n",
    "    for start in range(0,total, batch_size):\n",
    "        batch = df.iloc[start: min(start+batch_size,total)]\n",
    "        result = driver.execute_query(\"UNWIND $rows AS value \" + statement, \n",
    "                                      rows=batch.to_dict('records'))\n",
    "        print(result.summary.counters)\n",
    "    print(f'{total} rows in { time.time() - start_s} s.')    \n",
    "    return total"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "ed7f212e-9148-424c-adc6-d81db9f8e5a5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "create constraint chunk_id if not exists for (c:__Chunk__) require c.id is unique\n",
      "\n",
      "create constraint document_id if not exists for (d:__Document__) require d.id is unique\n",
      "\n",
      "create constraint entity_id if not exists for (c:__Community__) require c.community is unique\n",
      "\n",
      "create constraint entity_id if not exists for (e:__Entity__) require e.id is unique\n",
      "\n",
      "create constraint entity_title if not exists for (e:__Entity__) require e.title is unique\n",
      "\n",
      "create constraint entity_title if not exists for (e:__Covariate__) require e.title is unique\n",
      "\n",
      "create constraint related_id if not exists for ()-[rel:RELATED]->() require rel.id is unique\n"
     ]
    }
   ],
   "source": [
    "# create constraints\n",
    "\n",
    "statements = \"\"\"\n",
    "create constraint chunk_id if not exists for (c:__Chunk__) require c.id is unique;\n",
    "create constraint document_id if not exists for (d:__Document__) require d.id is unique;\n",
    "create constraint entity_id if not exists for (c:__Community__) require c.community is unique;\n",
    "create constraint entity_id if not exists for (e:__Entity__) require e.id is unique;\n",
    "create constraint entity_title if not exists for (e:__Entity__) require e.title is unique;\n",
    "create constraint entity_title if not exists for (e:__Covariate__) require e.title is unique;\n",
    "create constraint related_id if not exists for ()-[rel:RELATED]->() require rel.id is unique;\n",
    "\"\"\".split(\";\")\n",
    "\n",
    "for s in statements:\n",
    "    if len((s or \"\").strip()) > 0:\n",
    "        print(s)\n",
    "        driver.execute_query(query_=s)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "5ec93c92-499d-4ec6-bf3b-c34f74552600",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>text</th>\n",
       "      <th>n_tokens</th>\n",
       "      <th>document_ids</th>\n",
       "      <th>entity_ids</th>\n",
       "      <th>relationship_ids</th>\n",
       "      <th>covariate_ids</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2cf7a230c367a2dfaf0fc3c903eb8948</td>\n",
       "      <td># Operation: Dulce\\n\\n## Chapter 1\\n\\nThe thru...</td>\n",
       "      <td>2500</td>\n",
       "      <td>[958fdd043f17ade63cb13570b59df295]</td>\n",
       "      <td>[b45241d70f0e43fca764df95b2b81f77, 4119fd06010...</td>\n",
       "      <td>[b35c3d1a7daa4924b6bdb58bc69c354d, a97e2ecd870...</td>\n",
       "      <td>[ad5a2020-cdec-4982-acdf-dbe5ee530066, 9d8a0fe...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>6d1255303acb7c9dc951cb0f5fc3042c</td>\n",
       "      <td>be the same.\\n\\n\\*\\n\\nThe sense of foreboding...</td>\n",
       "      <td>2500</td>\n",
       "      <td>[958fdd043f17ade63cb13570b59df295]</td>\n",
       "      <td>[b45241d70f0e43fca764df95b2b81f77, 4119fd06010...</td>\n",
       "      <td>[b35c3d1a7daa4924b6bdb58bc69c354d, a97e2ecd870...</td>\n",
       "      <td>[5d1c9126-c48d-4755-9f9c-f739c823f95f, ec64a42...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                 id  \\\n",
       "0  2cf7a230c367a2dfaf0fc3c903eb8948   \n",
       "1  6d1255303acb7c9dc951cb0f5fc3042c   \n",
       "\n",
       "                                                text  n_tokens  \\\n",
       "0  # Operation: Dulce\\n\\n## Chapter 1\\n\\nThe thru...      2500   \n",
       "1   be the same.\\n\\n\\*\\n\\nThe sense of foreboding...      2500   \n",
       "\n",
       "                         document_ids  \\\n",
       "0  [958fdd043f17ade63cb13570b59df295]   \n",
       "1  [958fdd043f17ade63cb13570b59df295]   \n",
       "\n",
       "                                          entity_ids  \\\n",
       "0  [b45241d70f0e43fca764df95b2b81f77, 4119fd06010...   \n",
       "1  [b45241d70f0e43fca764df95b2b81f77, 4119fd06010...   \n",
       "\n",
       "                                    relationship_ids  \\\n",
       "0  [b35c3d1a7daa4924b6bdb58bc69c354d, a97e2ecd870...   \n",
       "1  [b35c3d1a7daa4924b6bdb58bc69c354d, a97e2ecd870...   \n",
       "\n",
       "                                       covariate_ids  \n",
       "0  [ad5a2020-cdec-4982-acdf-dbe5ee530066, 9d8a0fe...  \n",
       "1  [5d1c9126-c48d-4755-9f9c-f739c823f95f, ec64a42...  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "text_df = pd.read_parquet('create_final_text_units.parquet')\n",
    "text_df.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "ffd3d380-8710-46f5-b90a-04ed8482192c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'_contains_updates': True, 'labels_added': 13, 'relationships_created': 12, 'nodes_created': 13, 'properties_set': 37}\n",
      "12 rows in 0.08599472045898438 s.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "12"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "statement = \"\"\"\n",
    "MERGE (n:__Chunk__ {id:value.id})\n",
    "SET n += value {.text, .n_tokens}\n",
    "WITH n, value\n",
    "UNWIND value.document_ids AS document\n",
    "MERGE (d:__Document__ {id:document})\n",
    "MERGE (n)-[:PART_OF_DOCUMENT]->(d)\n",
    "\"\"\"\n",
    "batched_import(statement, text_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "140b420e-045e-4c71-9f25-1a20c5b528bd",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>name</th>\n",
       "      <th>type</th>\n",
       "      <th>description</th>\n",
       "      <th>human_readable_id</th>\n",
       "      <th>graph_embedding</th>\n",
       "      <th>text_unit_ids</th>\n",
       "      <th>description_embedding</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>b45241d70f0e43fca764df95b2b81f77</td>\n",
       "      <td>ALEX MERCER</td>\n",
       "      <td>PERSON</td>\n",
       "      <td>Alex Mercer is a character with a military bac...</td>\n",
       "      <td>0</td>\n",
       "      <td>None</td>\n",
       "      <td>[00fafabae48948779fee2afe600f5143, 1e433d6b308...</td>\n",
       "      <td>[0.009358493611216545, -0.02407047711312771, -...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4119fd06010c494caa07f439b333f4c5</td>\n",
       "      <td>TAYLOR CRUZ</td>\n",
       "      <td>PERSON</td>\n",
       "      <td>Taylor Cruz is a character who plays a pivotal...</td>\n",
       "      <td>1</td>\n",
       "      <td>None</td>\n",
       "      <td>[00fafabae48948779fee2afe600f5143, 1e433d6b308...</td>\n",
       "      <td>[0.0020127426832914352, -0.027186712250113487,...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                 id         name    type  \\\n",
       "0  b45241d70f0e43fca764df95b2b81f77  ALEX MERCER  PERSON   \n",
       "1  4119fd06010c494caa07f439b333f4c5  TAYLOR CRUZ  PERSON   \n",
       "\n",
       "                                         description  human_readable_id  \\\n",
       "0  Alex Mercer is a character with a military bac...                  0   \n",
       "1  Taylor Cruz is a character who plays a pivotal...                  1   \n",
       "\n",
       "  graph_embedding                                      text_unit_ids  \\\n",
       "0            None  [00fafabae48948779fee2afe600f5143, 1e433d6b308...   \n",
       "1            None  [00fafabae48948779fee2afe600f5143, 1e433d6b308...   \n",
       "\n",
       "                               description_embedding  \n",
       "0  [0.009358493611216545, -0.02407047711312771, -...  \n",
       "1  [0.0020127426832914352, -0.027186712250113487,...  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "entity_df = pd.read_parquet('create_final_entities.parquet')\n",
    "entity_df.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "1d038114-0714-48ee-a48a-c421cd539661",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'_contains_updates': True, 'labels_added': 217, 'relationships_created': 307, 'nodes_created': 217, 'properties_set': 1085}\n",
      "217 rows in 0.37180399894714355 s.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "217"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "entity_statement = \"\"\"\n",
    "MERGE (n:__Entity__ {id:value.id})\n",
    "SET n += value {.human_readable_id, .description, name:replace(value.name,'\"',''), .description_embedding}\n",
    "WITH n, value\n",
    "CALL apoc.create.addLabels(n, case when value.type is null OR value.type = \"\" then [] else [apoc.text.upperCamelCase(replace(value.type,'\"',''))] end) yield node\n",
    "UNWIND value.text_unit_ids AS text_unit\n",
    "MERGE (c:__Chunk__ {id:text_unit})\n",
    "MERGE (c)-[:MENTIONS]->(n)\n",
    "RETURN count(*)\n",
    "\"\"\"\n",
    "batched_import(entity_statement, entity_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "5e713603-c508-4964-ba49-474e4867b747",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source</th>\n",
       "      <th>target</th>\n",
       "      <th>weight</th>\n",
       "      <th>description</th>\n",
       "      <th>text_unit_ids</th>\n",
       "      <th>id</th>\n",
       "      <th>human_readable_id</th>\n",
       "      <th>source_degree</th>\n",
       "      <th>target_degree</th>\n",
       "      <th>rank</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>ALEX MERCER</td>\n",
       "      <td>TAYLOR CRUZ</td>\n",
       "      <td>7.0</td>\n",
       "      <td>Alex Mercer and Taylor Cruz are integral membe...</td>\n",
       "      <td>[00fafabae48948779fee2afe600f5143, 1e433d6b308...</td>\n",
       "      <td>b35c3d1a7daa4924b6bdb58bc69c354d</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "      <td>12</td>\n",
       "      <td>21</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>ALEX MERCER</td>\n",
       "      <td>TAYLOR CRUZ</td>\n",
       "      <td>7.0</td>\n",
       "      <td>Alex Mercer and Taylor Cruz are integral membe...</td>\n",
       "      <td>[00fafabae48948779fee2afe600f5143, 1e433d6b308...</td>\n",
       "      <td>b35c3d1a7daa4924b6bdb58bc69c354d</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "      <td>12</td>\n",
       "      <td>21</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        source       target  weight  \\\n",
       "0  ALEX MERCER  TAYLOR CRUZ     7.0   \n",
       "1  ALEX MERCER  TAYLOR CRUZ     7.0   \n",
       "\n",
       "                                         description  \\\n",
       "0  Alex Mercer and Taylor Cruz are integral membe...   \n",
       "1  Alex Mercer and Taylor Cruz are integral membe...   \n",
       "\n",
       "                                       text_unit_ids  \\\n",
       "0  [00fafabae48948779fee2afe600f5143, 1e433d6b308...   \n",
       "1  [00fafabae48948779fee2afe600f5143, 1e433d6b308...   \n",
       "\n",
       "                                 id human_readable_id  source_degree  \\\n",
       "0  b35c3d1a7daa4924b6bdb58bc69c354d                 0              9   \n",
       "1  b35c3d1a7daa4924b6bdb58bc69c354d                 0              9   \n",
       "\n",
       "   target_degree  rank  \n",
       "0             12    21  \n",
       "1             12    21  "
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rel_df = pd.read_parquet('create_final_relationships.parquet')\n",
    "rel_df.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "27900c01-89e1-4dec-9d5c-c07317c68baf",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'_contains_updates': True, 'relationships_created': 69, 'properties_set': 1449}\n",
      "276 rows in 0.1078798770904541 s.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "276"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rel_statement = \"\"\"\n",
    "    MATCH (source:__Entity__ {name:replace(value.source,'\"','')})\n",
    "    MATCH (target:__Entity__ {name:replace(value.target,'\"','')})\n",
    "    // not necessary to merge on id as there is only one relationship per pair\n",
    "    MERGE (source)-[rel:RELATED {id: value.id}]->(target)\n",
    "    SET rel += value {.rank, .weight, .human_readable_id, .description, .text_unit_ids}\n",
    "    RETURN count(*) as createdRels\n",
    "\"\"\"\n",
    "batched_import(rel_statement, rel_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "1be9e7a9-69ee-406b-bce5-95a9c41ecffe",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>community</th>\n",
       "      <th>full_content</th>\n",
       "      <th>level</th>\n",
       "      <th>rank</th>\n",
       "      <th>title</th>\n",
       "      <th>rank_explanation</th>\n",
       "      <th>summary</th>\n",
       "      <th>findings</th>\n",
       "      <th>full_content_json</th>\n",
       "      <th>id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>4</td>\n",
       "      <td># Dulce Base and the Paranormal Military Squad...</td>\n",
       "      <td>1</td>\n",
       "      <td>8.5</td>\n",
       "      <td>Dulce Base and the Paranormal Military Squad: ...</td>\n",
       "      <td>The impact severity rating is high due to the ...</td>\n",
       "      <td>The community is centered around Dulce Base, a...</td>\n",
       "      <td>[{'explanation': 'Dulce Base is the primary lo...</td>\n",
       "      <td>{\\n    \"title\": \"Dulce Base and the Paranormal...</td>\n",
       "      <td>6f8ba6b6-506e-46c1-83ce-982d59622554</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>5</td>\n",
       "      <td># Sam Rivera and the Paranormal Military Squad...</td>\n",
       "      <td>1</td>\n",
       "      <td>7.5</td>\n",
       "      <td>Sam Rivera and the Paranormal Military Squad a...</td>\n",
       "      <td>The impact severity rating is high due to the ...</td>\n",
       "      <td>The community is centered around Sam Rivera, a...</td>\n",
       "      <td>[{'explanation': 'Sam Rivera is recognized for...</td>\n",
       "      <td>{\\n    \"title\": \"Sam Rivera and the Paranormal...</td>\n",
       "      <td>418f4536-d673-4212-8a7c-ca1aac547d0f</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  community                                       full_content  level  rank  \\\n",
       "0         4  # Dulce Base and the Paranormal Military Squad...      1   8.5   \n",
       "1         5  # Sam Rivera and the Paranormal Military Squad...      1   7.5   \n",
       "\n",
       "                                               title  \\\n",
       "0  Dulce Base and the Paranormal Military Squad: ...   \n",
       "1  Sam Rivera and the Paranormal Military Squad a...   \n",
       "\n",
       "                                    rank_explanation  \\\n",
       "0  The impact severity rating is high due to the ...   \n",
       "1  The impact severity rating is high due to the ...   \n",
       "\n",
       "                                             summary  \\\n",
       "0  The community is centered around Dulce Base, a...   \n",
       "1  The community is centered around Sam Rivera, a...   \n",
       "\n",
       "                                            findings  \\\n",
       "0  [{'explanation': 'Dulce Base is the primary lo...   \n",
       "1  [{'explanation': 'Sam Rivera is recognized for...   \n",
       "\n",
       "                                   full_content_json  \\\n",
       "0  {\\n    \"title\": \"Dulce Base and the Paranormal...   \n",
       "1  {\\n    \"title\": \"Sam Rivera and the Paranormal...   \n",
       "\n",
       "                                     id  \n",
       "0  6f8ba6b6-506e-46c1-83ce-982d59622554  \n",
       "1  418f4536-d673-4212-8a7c-ca1aac547d0f  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "community_df = pd.read_parquet('create_final_community_reports.parquet')\n",
    "community_df.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "5c6ed591-f98c-4403-9fde-8d4cb4c01cca",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'_contains_updates': True, 'labels_added': 37, 'relationships_created': 31, 'nodes_created': 37, 'properties_set': 110}\n",
      "6 rows in 0.05302619934082031 s.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "6"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# import communities\n",
    "# Run only once / not idempotent\n",
    "community_statement = \"\"\"\n",
    "MERGE (c:__Community__ {id:value.id})\n",
    "SET c += value {.community, .level, .title, .rank, .rank_explanation, .full_content, .summary}\n",
    "WITH c, value\n",
    "UNWIND value.findings AS finding\n",
    "CREATE (c)-[:HAS_FINDING]->(f:Finding)\n",
    "SET f += finding\n",
    "\"\"\"\n",
    "batched_import(community_statement, community_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "523bed92-d12c-4fc4-aa44-6c62321b36bc",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>human_readable_id</th>\n",
       "      <th>covariate_type</th>\n",
       "      <th>type</th>\n",
       "      <th>description</th>\n",
       "      <th>subject_id</th>\n",
       "      <th>subject_type</th>\n",
       "      <th>object_id</th>\n",
       "      <th>object_type</th>\n",
       "      <th>status</th>\n",
       "      <th>start_date</th>\n",
       "      <th>end_date</th>\n",
       "      <th>source_text</th>\n",
       "      <th>text_unit_id</th>\n",
       "      <th>document_ids</th>\n",
       "      <th>n_tokens</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>ad5a2020-cdec-4982-acdf-dbe5ee530066</td>\n",
       "      <td>1</td>\n",
       "      <td>claim</td>\n",
       "      <td>MISSION INVOLVEMENT</td>\n",
       "      <td>Agent Alex Mercer's compliance in the briefing...</td>\n",
       "      <td>AGENT ALEX MERCER</td>\n",
       "      <td>None</td>\n",
       "      <td>NONE</td>\n",
       "      <td>None</td>\n",
       "      <td>SUSPECTED</td>\n",
       "      <td>NONE</td>\n",
       "      <td>NONE</td>\n",
       "      <td>\"With dulled eyes, he scanned the projectors o...</td>\n",
       "      <td>2cf7a230c367a2dfaf0fc3c903eb8948</td>\n",
       "      <td>[958fdd043f17ade63cb13570b59df295]</td>\n",
       "      <td>2500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>9d8a0fe5-07b7-4b1a-b5be-1317d0fac005</td>\n",
       "      <td>2</td>\n",
       "      <td>claim</td>\n",
       "      <td>AUTHORITY EXERCISE</td>\n",
       "      <td>Agent Taylor Cruz exercises authority and dema...</td>\n",
       "      <td>AGENT TAYLOR CRUZ</td>\n",
       "      <td>None</td>\n",
       "      <td>NONE</td>\n",
       "      <td>None</td>\n",
       "      <td>TRUE</td>\n",
       "      <td>NONE</td>\n",
       "      <td>NONE</td>\n",
       "      <td>\"It was Taylor Cruz’s voice, laced with an edg...</td>\n",
       "      <td>2cf7a230c367a2dfaf0fc3c903eb8948</td>\n",
       "      <td>[958fdd043f17ade63cb13570b59df295]</td>\n",
       "      <td>2500</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                     id human_readable_id covariate_type  \\\n",
       "0  ad5a2020-cdec-4982-acdf-dbe5ee530066                 1          claim   \n",
       "1  9d8a0fe5-07b7-4b1a-b5be-1317d0fac005                 2          claim   \n",
       "\n",
       "                  type                                        description  \\\n",
       "0  MISSION INVOLVEMENT  Agent Alex Mercer's compliance in the briefing...   \n",
       "1   AUTHORITY EXERCISE  Agent Taylor Cruz exercises authority and dema...   \n",
       "\n",
       "          subject_id subject_type object_id object_type     status start_date  \\\n",
       "0  AGENT ALEX MERCER         None      NONE        None  SUSPECTED       NONE   \n",
       "1  AGENT TAYLOR CRUZ         None      NONE        None       TRUE       NONE   \n",
       "\n",
       "  end_date                                        source_text  \\\n",
       "0     NONE  \"With dulled eyes, he scanned the projectors o...   \n",
       "1     NONE  \"It was Taylor Cruz’s voice, laced with an edg...   \n",
       "\n",
       "                       text_unit_id                        document_ids  \\\n",
       "0  2cf7a230c367a2dfaf0fc3c903eb8948  [958fdd043f17ade63cb13570b59df295]   \n",
       "1  2cf7a230c367a2dfaf0fc3c903eb8948  [958fdd043f17ade63cb13570b59df295]   \n",
       "\n",
       "   n_tokens  \n",
       "0      2500  \n",
       "1      2500  "
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cov_df = pd.read_parquet('create_final_covariates.parquet')\n",
    "cov_df.head(2)\n",
    "# Subject id do not match entity ids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "3e064234-5fce-448e-8bb4-ab2f35699049",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'_contains_updates': True, 'labels_added': 89, 'relationships_created': 89, 'nodes_created': 89, 'properties_set': 1061}\n",
      "89 rows in 0.13370895385742188 s.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "89"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# import covariates\n",
    "cov_statement = \"\"\"\n",
    "MERGE (c:__Covariate__ {id:value.id})\n",
    "SET c += apoc.map.clean(value, [\"text_unit_id\", \"document_ids\", \"n_tokens\"], [Null, \"\"])\n",
    "WITH c, value\n",
    "MATCH (ch:__Chunk__ {id: value.text_unit_id})\n",
    "MERGE (ch)-[:HAS_COVARIATE]->(c)\n",
    "\"\"\"\n",
    "batched_import(cov_statement, cov_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "fc9f6606-0cce-4f28-9d88-eaf894d8110b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>level</th>\n",
       "      <th>title</th>\n",
       "      <th>type</th>\n",
       "      <th>description</th>\n",
       "      <th>source_id</th>\n",
       "      <th>community</th>\n",
       "      <th>degree</th>\n",
       "      <th>human_readable_id</th>\n",
       "      <th>id</th>\n",
       "      <th>size</th>\n",
       "      <th>graph_embedding</th>\n",
       "      <th>entity_type</th>\n",
       "      <th>top_level_node_id</th>\n",
       "      <th>x</th>\n",
       "      <th>y</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>ALEX MERCER</td>\n",
       "      <td>PERSON</td>\n",
       "      <td>Alex Mercer is a character with a military bac...</td>\n",
       "      <td>00fafabae48948779fee2afe600f5143,1e433d6b30887...</td>\n",
       "      <td>1</td>\n",
       "      <td>9</td>\n",
       "      <td>0</td>\n",
       "      <td>b45241d70f0e43fca764df95b2b81f77</td>\n",
       "      <td>9</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>b45241d70f0e43fca764df95b2b81f77</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>217</th>\n",
       "      <td>1</td>\n",
       "      <td>ALEX MERCER</td>\n",
       "      <td>PERSON</td>\n",
       "      <td>Alex Mercer is a character with a military bac...</td>\n",
       "      <td>00fafabae48948779fee2afe600f5143,1e433d6b30887...</td>\n",
       "      <td>4</td>\n",
       "      <td>9</td>\n",
       "      <td>0</td>\n",
       "      <td>b45241d70f0e43fca764df95b2b81f77</td>\n",
       "      <td>9</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>b45241d70f0e43fca764df95b2b81f77</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     level        title    type  \\\n",
       "0        0  ALEX MERCER  PERSON   \n",
       "217      1  ALEX MERCER  PERSON   \n",
       "\n",
       "                                           description  \\\n",
       "0    Alex Mercer is a character with a military bac...   \n",
       "217  Alex Mercer is a character with a military bac...   \n",
       "\n",
       "                                             source_id community  degree  \\\n",
       "0    00fafabae48948779fee2afe600f5143,1e433d6b30887...         1       9   \n",
       "217  00fafabae48948779fee2afe600f5143,1e433d6b30887...         4       9   \n",
       "\n",
       "     human_readable_id                                id  size  \\\n",
       "0                    0  b45241d70f0e43fca764df95b2b81f77     9   \n",
       "217                  0  b45241d70f0e43fca764df95b2b81f77     9   \n",
       "\n",
       "    graph_embedding entity_type                 top_level_node_id  x  y  \n",
       "0              None        None  b45241d70f0e43fca764df95b2b81f77  0  0  \n",
       "217            None        None  b45241d70f0e43fca764df95b2b81f77  0  0  "
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nodes_df = pd.read_parquet('create_final_nodes.parquet')\n",
    "nodes_df[nodes_df['title'] == 'ALEX MERCER']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "47bb6f5c-4c1c-4849-8f1a-cb76fa98b925",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "community\n",
       "1    14\n",
       "2     9\n",
       "4     9\n",
       "0     6\n",
       "5     5\n",
       "3     3\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nodes_df.community.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "dde636a4-a876-4d30-b1a2-8124023c14ed",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{}\n",
      "46 rows in 0.06763219833374023 s.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "46"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Connect nodes to first level community\n",
    "first_df = nodes_df[nodes_df['community'].notna()]\n",
    "first_statement = \"\"\"\n",
    "MATCH (c:__Entity__ {name:replace(value.title,'\"','')})\n",
    "MATCH (c1:__Community__ {community: value.community})\n",
    "MERGE (c)-[:IN_COMMUNITY]->(c1)\n",
    "RETURN count(distinct c1)\n",
    "\"\"\"\n",
    "batched_import(first_statement, first_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7131f3a0-2b71-4017-9dcd-24913d964dc0",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
